{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Practice: scrape Baidu Baike\n",
    "**Here we build a scraper to crawl Baidu Baike from this [page](https://baike.baidu.com/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711) onwards. We store a historical webpage that we have already visited to keep tracking it.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "from urllib.request import urlopen\n",
    "import re\n",
    "import random\n",
    "\n",
    "\n",
    "base_url = \"https://baike.baidu.com\"\n",
    "his = [\"/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711\"]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Select the last sub url in \"his\", print the title and url.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711\n"
     ]
    }
   ],
   "source": [
    "url = base_url + his[-1]\n",
    "\n",
    "html = urlopen(url).read().decode('utf-8')\n",
    "soup = BeautifulSoup(html, features='lxml')\n",
    "print(soup.find('h1').get_text(), '    url: ', his[-1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Find all sub_urls for baidu baike (item page), randomly select a sub_urls and store it in \"his\". If no valid sub link is found, than pop last url in \"his\".**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711', '/item/%E4%B8%8B%E8%BD%BD%E8%80%85']\n"
     ]
    }
   ],
   "source": [
    "# find valid urls\n",
    "sub_urls = soup.find_all(\"a\", {\"target\": \"_blank\", \"href\": re.compile(\"/item/(%.{2})+$\")})\n",
    "\n",
    "if len(sub_urls) != 0:\n",
    "    his.append(random.sample(sub_urls, 1)[0]['href'])\n",
    "else:\n",
    "    # no valid sub link found\n",
    "    his.pop()\n",
    "print(his)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Put everthing together. Random running for 20 iterations. See what we end up with.**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0 网络爬虫     url:  /item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 路由器     url:  /item/%E8%B7%AF%E7%94%B1%E5%99%A8\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "5 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "6 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "8 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "11 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "12 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "13 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "14 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "15 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "16 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "17 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "18 服务等级     url:  /item/%E6%9C%8D%E5%8A%A1%E7%AD%89%E7%BA%A7\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "19 呼损率     url:  /item/%E5%91%BC%E6%8D%9F%E7%8E%87\n"
     ]
    }
   ],
   "source": [
    "his = [\"/item/%E7%BD%91%E7%BB%9C%E7%88%AC%E8%99%AB/5162711\"]\n",
    "\n",
    "for i in range(20):\n",
    "    url = base_url + his[-1]\n",
    "\n",
    "    html = urlopen(url).read().decode('utf-8')\n",
    "    soup = BeautifulSoup(html, features='lxml')\n",
    "    print(i, soup.find('h1').get_text(), '    url: ', his[-1])\n",
    "\n",
    "    # find valid urls\n",
    "    sub_urls = soup.find_all(\"a\", {\"target\": \"_blank\", \"href\": re.compile(\"/item/(%.{2})+$\")})\n",
    "\n",
    "    if len(sub_urls) != 0:\n",
    "        his.append(random.sample(sub_urls, 1)[0]['href'])\n",
    "    else:\n",
    "        # no valid sub link found\n",
    "        his.pop()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
